{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/tomasonjo/blogs/blob/master/llm/openaifunction_constructing_graph.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "cllpFMqCUaA9",
    "outputId": "7df89ff8-292c-4825-8f82-2e90e5827801"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: langchain in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (0.1.0)\n",
      "Requirement already satisfied: neo4j in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (5.16.0)\n",
      "Requirement already satisfied: openai in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (1.6.1)\n",
      "Requirement already satisfied: wikipedia in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (1.4.0)\n",
      "Requirement already satisfied: tiktoken in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (0.5.2)\n",
      "Requirement already satisfied: langchain_openai in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (0.0.2.post1)\n",
      "Requirement already satisfied: PyYAML>=5.3 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain) (6.0.1)\n",
      "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain) (2.0.21)\n",
      "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain) (3.9.0)\n",
      "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain) (0.6.3)\n",
      "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain) (1.33)\n",
      "Requirement already satisfied: langchain-community<0.1,>=0.0.9 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain) (0.0.9)\n",
      "Requirement already satisfied: langchain-core<0.2,>=0.1.7 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain) (0.1.7)\n",
      "Requirement already satisfied: langsmith<0.1.0,>=0.0.77 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain) (0.0.77)\n",
      "Requirement already satisfied: numpy<2,>=1 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain) (1.26.2)\n",
      "Requirement already satisfied: pydantic<3,>=1 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain) (1.10.12)\n",
      "Requirement already satisfied: requests<3,>=2 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain) (2.31.0)\n",
      "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain) (8.2.2)\n",
      "Requirement already satisfied: pytz in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from neo4j) (2023.3.post1)\n",
      "Requirement already satisfied: anyio<5,>=3.5.0 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from openai) (3.5.0)\n",
      "Requirement already satisfied: distro<2,>=1.7.0 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from openai) (1.8.0)\n",
      "Requirement already satisfied: httpx<1,>=0.23.0 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from openai) (0.26.0)\n",
      "Requirement already satisfied: sniffio in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from openai) (1.2.0)\n",
      "Requirement already satisfied: tqdm>4 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from openai) (4.65.0)\n",
      "Requirement already satisfied: typing-extensions<5,>=4.7 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from openai) (4.9.0)\n",
      "Requirement already satisfied: beautifulsoup4 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from wikipedia) (4.12.2)\n",
      "Requirement already satisfied: regex>=2022.1.18 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from tiktoken) (2023.10.3)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.1.0)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.4)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.3)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.0)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.2.0)\n",
      "Requirement already satisfied: idna>=2.8 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from anyio<5,>=3.5.0->openai) (3.4)\n",
      "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (3.20.1)\n",
      "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (0.9.0)\n",
      "Requirement already satisfied: certifi in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from httpx<1,>=0.23.0->openai) (2023.11.17)\n",
      "Requirement already satisfied: httpcore==1.* in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from httpx<1,>=0.23.0->openai) (1.0.2)\n",
      "Requirement already satisfied: h11<0.15,>=0.13 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.14.0)\n",
      "Requirement already satisfied: jsonpointer>=1.9 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from jsonpatch<2.0,>=1.33->langchain) (2.1)\n",
      "Requirement already satisfied: packaging<24.0,>=23.2 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from langchain-core<0.2,>=0.1.7->langchain) (23.2)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from requests<3,>=2->langchain) (2.0.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from requests<3,>=2->langchain) (2.1.0)\n",
      "Requirement already satisfied: soupsieve>1.2 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from beautifulsoup4->wikipedia) (2.5)\n",
      "Requirement already satisfied: mypy-extensions>=0.3.0 in /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain) (1.0.0)\n"
     ]
    }
   ],
   "source": [
    "!pip install langchain neo4j openai wikipedia tiktoken langchain_openai"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "6puFOim9N5RZ"
   },
   "source": [
    "# Constructing knowledge graphs from text using OpenAI functions\n",
    "## Seamlessy implement information extraction pipeline with LangChain and Neo4j\n",
    "Extracting structured information from unstructured data like text has been around for some time and is nothing new. However, LLMs brought a significant shift to the field of information extraction. If before you needed a team of machine learning experts to curate datasets and train custom models, you only need access to an LLM nowadays. The barrier to entry has dropped significantly, making what was just a couple of years ago reserved for domain experts more accessible to even non-technical people.\n",
    "\n",
    "![1_PFUAMubUijcM3ybXaK793g.webp]()\n",
    "\n",
    "The image depicts the transformation of unstructured text into structured information. This process, labeled as the information extraction pipeline, results in a graph representation of information. The nodes represent key entities, while the connecting lines denote the relationships between these entities. Knowledge graphs are useful for [multi-hop question-answering](https://medium.com/neo4j/knowledge-graphs-llms-multi-hop-question-answering-322113f53f51), [real-time analytics](https://medium.com/neo4j/knowledge-graphs-llms-real-time-graph-analytics-89b392eaaa95), or when you want to [combine structured and unstructured data in a single database](https://blog.langchain.dev/using-a-knowledge-graph-to-implement-a-devops-rag-application/).\n",
    "\n",
    "While extracting structured information from text has been made more accessible due to LLMs, it is by no means a solved problem. In this blog post, we will use OpenAI functions in combination with LangChain to construct a knowledge graph from a sample Wikipedia page. Along the way, we will discuss best practices as well as some limitations of current LLMs.\n",
    "\n",
    "# Neo4j Environment setup\n",
    "You need to setup a Neo4j to follow along with the examples in this blog post. The easiest way is to start a free instance on Neo4j Aura, which offers cloud instances of Neo4j database. Alternatively, you can also setup a local instance of the Neo4j database by downloading the Neo4j Desktop application and creating a local database instance.\n",
    "\n",
    "The following code will instantiate a LangChain wrapper to connect to Neo4j Database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "F25P-N_DmEBK"
   },
   "outputs": [],
   "source": [
    "from langchain.graphs import Neo4jGraph\n",
    "\n",
    "url = \"bolt://54.236.226.158:7687\"\n",
    "username =\"neo4j\"\n",
    "password = \"radiuses-investment-college\"\n",
    "graph = Neo4jGraph(\n",
    "    url=url,\n",
    "    username=username,\n",
    "    password=password\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "rzVAN8UgO5Q0"
   },
   "source": [
    "## Information extraction pipeline\n",
    "A typical information extraction pipeline contains the following steps.\n",
    "\n",
    "![1_93ZK9-74dYv4eXY-Oe_bkA.png]()\n",
    "\n",
    "In the first step, we run the input text through a coreference resolution model. The coreference resolution is the task of finding all expressions that refer to a specific entity. Simply put, it links all the pronouns to the referred entity. In the named entity recognition part of the pipeline, we try to extract all the mentioned entities. The above example contains three entities: Tomaz, Blog, and Diagram. The next step is the entity disambiguation step, an essential but often overlooked part of an information extraction pipeline. Entity disambiguation is the process of accurately identifying and distinguishing between entities with similar names or references to ensure the correct entity is recognized in a given context. In the last step, the model tried to identify various relationships between entities. For example, it could locate the LIKES relationship between Tomaz and Blog entities.\n",
    "## Extracting structured information with OpenAI functions\n",
    "OpenAI functions are a great fit to extract structured information from natural language. The idea behind OpenAI functions is to have an LLM output a predefined JSON object with populated values. The predefined JSON object can be used as input to other functions in so-called RAG applications, or it can be used to extract predefined structured information from text.\n",
    "\n",
    "In LangChain, you can pass a Pydantic class as description of the desired JSON object of the OpenAI functions feature. Therefore, we will start by defining the desired structure of information we want to extract from text. LangChain already has definitions of nodes and relationship as Pydantic classes that we can reuse.\n",
    "\n",
    "Unfortunately, it turns out that OpenAI functions don't currently support a dictionary object as a value. Therefore, we have to overwrite the properties definition to adhere to the limitations of the functions' endpoint."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "K589QK8WUcPo"
   },
   "outputs": [],
   "source": [
    "from langchain_community.graphs.graph_document import (\n",
    "    Node as BaseNode,\n",
    "    Relationship as BaseRelationship,\n",
    "    GraphDocument,\n",
    ")\n",
    "from langchain.schema import Document\n",
    "from typing import List, Dict, Any, Optional\n",
    "from langchain.pydantic_v1 import Field, BaseModel\n",
    "\n",
    "class Property(BaseModel):\n",
    "  \"\"\"A single property consisting of key and value\"\"\"\n",
    "  key: str = Field(..., description=\"key\")\n",
    "  value: str = Field(..., description=\"value\")\n",
    "\n",
    "class Node(BaseNode):\n",
    "    properties: Optional[List[Property]] = Field(\n",
    "        None, description=\"List of node properties\")\n",
    "\n",
    "class Relationship(BaseRelationship):\n",
    "    properties: Optional[List[Property]] = Field(\n",
    "        None, description=\"List of relationship properties\"\n",
    "    )\n",
    "\n",
    "class KnowledgeGraph(BaseModel):\n",
    "    \"\"\"Generate a knowledge graph with entities and relationships.\"\"\"\n",
    "    nodes: List[Node] = Field(\n",
    "        ..., description=\"List of nodes in the knowledge graph\")\n",
    "    rels: List[Relationship] = Field(\n",
    "        ..., description=\"List of relationships in the knowledge graph\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "mp1IH_J5PSyD"
   },
   "source": [
    "Here, we have overwritten the properties value to be a list of Property classes instead of a dictionary to overcome the limitations of the API. Because you can only pass a single object to the API, we can to combine the nodes and relationships in a single class called KnowledgeGraph."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "id": "-bV8nHwimqS7"
   },
   "outputs": [],
   "source": [
    "def format_property_key(s: str) -> str:\n",
    "    words = s.split()\n",
    "    if not words:\n",
    "        return s\n",
    "    first_word = words[0].lower()\n",
    "    capitalized_words = [word.capitalize() for word in words[1:]]\n",
    "    return \"\".join([first_word] + capitalized_words)\n",
    "\n",
    "def props_to_dict(props) -> dict:\n",
    "    \"\"\"Convert properties to a dictionary.\"\"\"\n",
    "    properties = {}\n",
    "    if not props:\n",
    "      return properties\n",
    "    for p in props:\n",
    "        properties[format_property_key(p.key)] = p.value\n",
    "    return properties\n",
    "\n",
    "def map_to_base_node(node: Node) -> BaseNode:\n",
    "    \"\"\"Map the KnowledgeGraph Node to the base Node.\"\"\"\n",
    "    properties = props_to_dict(node.properties) if node.properties else {}\n",
    "    # Add name property for better Cypher statement generation\n",
    "    properties[\"name\"] = node.id.title()\n",
    "    return BaseNode(\n",
    "        id=node.id.title(), type=node.type.capitalize(), properties=properties\n",
    "    )\n",
    "\n",
    "\n",
    "def map_to_base_relationship(rel: Relationship) -> BaseRelationship:\n",
    "    \"\"\"Map the KnowledgeGraph Relationship to the base Relationship.\"\"\"\n",
    "    source = map_to_base_node(rel.source)\n",
    "    target = map_to_base_node(rel.target)\n",
    "    properties = props_to_dict(rel.properties) if rel.properties else {}\n",
    "    return BaseRelationship(\n",
    "        source=source, target=target, type=rel.type, properties=properties\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "B-66HJMcPekG"
   },
   "source": [
    "The only thing left is to do a bit of prompt engineering and we are good to go. How I usually go about prompt engineering is the following:\n",
    "* Iterate over prompt and improve results using natural language\n",
    "* If something doesn't work as intended, ask ChatGPT to make it clearer for an LLM to understand the task\n",
    "* Finally, when the prompt has all the instructions needed, ask ChatGPT to summarize the instructions in a markdown format, saving on tokens and perhaps having more clear instructions\n",
    "\n",
    "I specifically chose the markdown format as I have seen somewhere that OpenAI models respond better to markdown syntax in prompts, and it seems to be at least plausible from my experience.\n",
    "Iterating over prompt engineering, I came up with the following system prompt for an information extraction pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "id": "kpGkdMyuUmAn"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from langchain.chains.openai_functions import (\n",
    "    create_openai_fn_chain,\n",
    "    create_structured_output_chain,\n",
    ")\n",
    "from langchain_openai import ChatOpenAI\n",
    "from langchain.prompts import ChatPromptTemplate\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"sk-\"\n",
    "llm = ChatOpenAI(model=\"gpt-3.5-turbo-16k\", temperature=0)\n",
    "\n",
    "def get_extraction_chain(\n",
    "    allowed_nodes: Optional[List[str]] = None,\n",
    "    allowed_rels: Optional[List[str]] = None\n",
    "    ):\n",
    "    prompt = ChatPromptTemplate.from_messages(\n",
    "        [(\n",
    "          \"system\",\n",
    "          f\"\"\"# Knowledge Graph Instructions for GPT-4\n",
    "## 1. Overview\n",
    "You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.\n",
    "- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.\n",
    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.\n",
    "## 2. Labeling Nodes\n",
    "- **Consistency**: Ensure you use basic or elementary types for node labels.\n",
    "  - For example, when you identify an entity representing a person, always label it as **\"person\"**. Avoid using more specific terms like \"mathematician\" or \"scientist\".\n",
    "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.\n",
    "{'- **Allowed Node Labels:**' + \", \".join(allowed_nodes) if allowed_nodes else \"\"}\n",
    "{'- **Allowed Relationship Types**:' + \", \".join(allowed_rels) if allowed_rels else \"\"}\n",
    "## 3. Handling Numerical Data and Dates\n",
    "- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.\n",
    "- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.\n",
    "- **Property Format**: Properties must be in a key-value format.\n",
    "- **Quotation Marks**: Never use escaped single or double quotes within property values.\n",
    "- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.\n",
    "## 4. Coreference Resolution\n",
    "- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.\n",
    "If an entity, such as \"John Doe\", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., \"Joe\", \"he\"),\n",
    "always use the most complete identifier for that entity throughout the knowledge graph. In this example, use \"John Doe\" as the entity ID.\n",
    "Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.\n",
    "## 5. Strict Compliance\n",
    "Adhere to the rules strictly. Non-compliance will result in termination.\n",
    "          \"\"\"),\n",
    "            (\"human\", \"Use the given format to extract information from the following input: {input}\"),\n",
    "            (\"human\", \"Tip: Make sure to answer in the correct format\"),\n",
    "        ])\n",
    "    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "qtpkiUmePohB"
   },
   "source": [
    "You can see that we are using the 16k version of the GPT-3.5 model. The main reason is that the OpenAI function output is a structured JSON object, and structured JSON syntax adds a lot of token overhead to the result. Essentially, you are paying for the convenience of structured output in increased token space.\n",
    "\n",
    "Besides the general instructions, I have also added the option to limit which node or relationship types should be extracted from text. You'll see through examples why this might come in handy.\n",
    "We have the Neo4j connection and LLM prompt ready, which means we can define the information extraction pipeline as a single function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "id": "Gu7aaanyU2NW"
   },
   "outputs": [],
   "source": [
    "def extract_and_store_graph(\n",
    "    document: Document,\n",
    "    nodes:Optional[List[str]] = None,\n",
    "    rels:Optional[List[str]]=None) -> None:\n",
    "    # Extract graph data using OpenAI functions\n",
    "    extract_chain = get_extraction_chain(nodes, rels)\n",
    "    data = extract_chain.invoke(document.page_content)['function']\n",
    "    # Construct a graph document\n",
    "    graph_document = GraphDocument(\n",
    "      nodes = [map_to_base_node(node) for node in data.nodes],\n",
    "      relationships = [map_to_base_relationship(rel) for rel in data.rels],\n",
    "      source = document\n",
    "    )\n",
    "    # Store information into a graph\n",
    "    graph.add_graph_documents([graph_document])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Je_UuAIYPsc5"
   },
   "source": [
    "The function takes in a LangChain document as well as optional nodes and relationship parameters, which are used to limit the types of objects we want the LLM to identify and extract. A month or so ago, we added the add_graph_documents method the Neo4j graph object, which we can utilize here to seamlessly import the graph.\n",
    "## Evaluation\n",
    "We will extract information from the Walt Disney Wikipedia page and construct a knowledge graph to test the pipeline. Here, we will utilize the Wikipedia loader and text chunking modules provided by LangChain."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "id": "wCvmpkhnyL0R"
   },
   "outputs": [],
   "source": [
    "from langchain.document_loaders import WikipediaLoader\n",
    "from langchain.text_splitter import TokenTextSplitter\n",
    "\n",
    "# Read the wikipedia article\n",
    "raw_documents = WikipediaLoader(query=\"Walt Disney\").load()\n",
    "# Define chunking strategy\n",
    "text_splitter = TokenTextSplitter(chunk_size=2048, chunk_overlap=24)\n",
    "\n",
    "# Only take the first the raw_documents\n",
    "documents = text_splitter.split_documents(raw_documents[:3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "a1snZNAvPwC3"
   },
   "source": [
    "You might have noticed that we use a relatively large chunk_size value. The reason is that we want to provide as much context as possible around a single sentence in order for the coreference resolution part to work as best as possible. Remember, the coreference step will only work if the entity and its reference appear in the same chunk; otherwise, the LLM doesn't have enough information to link the two.\n",
    "\n",
    "Now we can go ahead and run the documents through the information extraction pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "CULLN4VMU_Ou",
    "outputId": "1171611b-c820-4f7f-aac4-8a44040d0a9f"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3/3 [01:54<00:00, 38.18s/it]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "for i, d in tqdm(enumerate(documents), total=len(documents)):\n",
    "    extract_and_store_graph(d)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "-BwMNjWMPyR-"
   },
   "source": [
    "The process takes around 5 minutes, which is relatively slow. Therefore, you would probably want parallel API calls in production to deal with this problem and achieve some sort of scalability.\n",
    "Let's first look at the types of nodes and relationships the LLM identified.\n",
    "\n",
    "![Screenshot from 2023-10-18 11-05-33.png]()\n",
    "\n",
    "Since the graph schema is not provided, the LLM decides on the fly what types of node labels and relationship types it will use. For example, we can observe that there are Company and Organization node labels. Those two things are probably semantically similar or identical, so we would want to have only a single node label representing the two. This problem is more obvious with relationship types. For example, we have CO-FOUNDER and COFOUNDEROF relationships as well as DEVELOPER and DEVELOPEDBY.\n",
    "For any more serious project, you should define the node labels and relationship types the LLM should extract. Luckily, we have added the option to limit the types in the prompt by passing additional parameters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "5wIIu5DJQVKK",
    "outputId": "d153bca2-a88a-44af-80c8-52410c6bd152"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Delete the graph\n",
    "graph.query(\"MATCH (n) DETACH DELETE n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "shS8aPGAvZgT",
    "outputId": "c087187f-740e-45ea-960e-414420a301f5"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3/3 [04:03<00:00, 81.18s/it] \n"
     ]
    }
   ],
   "source": [
    "# Specify which node labels should be extracted by the LLM\n",
    "allowed_nodes = [\"Person\", \"Company\", \"Location\", \"Event\", \"Movie\", \"Service\", \"Award\"]\n",
    "\n",
    "for i, d in tqdm(enumerate(documents), total=len(documents)):\n",
    "    extract_and_store_graph(d, allowed_nodes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "S6HTx46DQANv"
   },
   "source": [
    "In this example, I have only limited the node labels, but you can easily limit the relationship types by passing another parameter to the `extract_and_store_graph` function.\n",
    "\n",
    "The graph turned out better than expected (after five iterations :) ). I couldn't catch the whole graph nicely in the visualization, but you can explore it on your own in Neo4j Browser other tools.\n",
    "## Entity disambiguation\n",
    "One thing I should mention is that we partly skipped entity disambiguation part. We used a large chunk size and added a specific instruction for coreference resolution and entity disambiguation in the system prompt. However, since each chunk is processed separately, there is no way to ensure consistency of entities between different text chunks. For example, you could end up with two nodes representing the same person.\n",
    "\n",
    "![Screenshot from 2023-10-18 11-35-38.png]()\n",
    "\n",
    "In this example, Walt Disney and Walter Elias Disney refer to the same real-world person. The entity disambiguation problem is nothing new and there has been various solution proposed to solve it:\n",
    "* Using [entity linking](https://wikifier.org/about.html) or [entity disambiguation NLP models](https://github.com/SapienzaNLP/extend)\n",
    "* Doing a [second pass through an LLM and asking it to perform entity disambiguation](https://medium.com/neo4j/creating-a-knowledge-graph-from-video-transcripts-with-gpt-4-52d7c7b9f32c)\n",
    "* [Graph-based approaches](https://neo4j.com/developer-blog/exploring-supervised-entity-resolution-in-neo4j/)\n",
    "\n",
    "Which solution you should use depends on your domain and use case. However, have in mind that entity disambiguation step should not be overlooked as it can have a significant impact on the accuracy and effectiveness of your RAG applications.\n",
    "## Rag Application\n",
    "The last thing we will do is show you how you can browse information in a knowledge graph by constructing Cypher statements. Cypher is a structured query language used to work with graph databases, similar to how SQL is used for relational databases. LangChain has a [GraphCypherQAChain](https://medium.com/neo4j/langchain-cypher-search-tips-tricks-f7c9e9abca4d) that reads the schema of the graph and constructs appropriate Cypher statements based on the user input."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "id": "8EeZG9I8JYGJ"
   },
   "outputs": [],
   "source": [
    "# Query the knowledge graph in a RAG application\n",
    "from langchain.chains import GraphCypherQAChain\n",
    "\n",
    "graph.refresh_schema()\n",
    "\n",
    "cypher_chain = GraphCypherQAChain.from_llm(\n",
    "    graph=graph,\n",
    "    cypher_llm=ChatOpenAI(temperature=0, model=\"gpt-4\"),\n",
    "    qa_llm=ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo\"),\n",
    "    validate_cypher=True, # Validate relationship directions\n",
    "    verbose=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 192
    },
    "id": "0-qaxjC2NOJm",
    "outputId": "11a0471d-8a4b-44f6-9e03-63cba2b90b23"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:189: LangChainDeprecationWarning: The function `run` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use invoke instead.\n",
      "  warn_deprecated(\n",
      "/Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:189: LangChainDeprecationWarning: The function `__call__` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use invoke instead.\n",
      "  warn_deprecated(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generated Cypher:\n",
      "\u001b[32;1m\u001b[1;3mMATCH (p:Person {name: \"Walter Elias Disney\"}) RETURN p.birthdate\u001b[0m\n",
      "Full Context:\n",
      "\u001b[32;1m\u001b[1;3m[{'p.birthdate': 'December 5, 1901'}]\u001b[0m\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:189: LangChainDeprecationWarning: The function `__call__` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use invoke instead.\n",
      "  warn_deprecated(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\u001b[1m> Finished chain.\u001b[0m\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'query': 'When was Walter Elias Disney born?',\n",
       " 'result': 'Walter Elias Disney was born on December 5, 1901.'}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cypher_chain.invoke({\"query\": \"When was Walter Elias Disney born?\"})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "PCyRYN8aQzhU"
   },
   "source": [
    "## Summary\n",
    "Knowledge graphs are a great fit when you need a combination of structured and structured data to power your RAG applications. In this blog post, you have learned how to construct a knowledge graph in Neo4j on an arbitrary text using OpenAI functions. OpenAI functions provide the convenience of neatly structured outputs, making them an ideal fit for extracting structured information. Just make sure you add an entity disambiguation step after the extraction, and you are golden."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "mnBM5jSDNTP7"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyOtPgbncRs178mcTTl511e3",
   "include_colab_link": true,
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
